This report presents the results of our birth country imputation project for 48,031 records, with 18,814 missing birth countries successfully imputed using multiple reference sources and matching strategies.
# Calculate key metrics
total_imputed <- sum(imputation_summary$n[imputation_summary$imp_type != "given"])
success_rate <- round((total_imputed / n_missing) * 100, 1)
methods_used <- nrow(imputation_summary) - 1
# Create summary box
summary_stats <- data.frame(
Metric = c("Total Records", "Missing Birth Countries", "Successfully Imputed",
"Success Rate", "Imputation Methods"),
Value = c(formatC(n_total, format="d", big.mark=","),
formatC(n_missing, format="d", big.mark=","),
formatC(total_imputed, format="d", big.mark=","),
paste0(success_rate, "%"),
methods_used)
)
summary_stats %>%
gt() %>%
tab_header(title = "Imputation Summary Statistics") %>%
cols_align(align = "center", columns = Value) %>%
tab_style(
style = list(cell_fill(color = "#E8F4FD")),
locations = cells_body(rows = 4) # Highlight success rate
)
| Imputation Summary Statistics | |
| Metric | Value |
|---|---|
| Total Records | 48,031 |
| Missing Birth Countries | 18,814 |
| Successfully Imputed | 18,808 |
| Success Rate | 100% |
| Imputation Methods | 26 |
After solving multi-classed imputation we could improve algorithm in detecting unique country of birth.
# Static data
imp_birth_country_before <- data.frame(
country_code = c('000/152', '000/151', '000/160', '149/152', '000/124',
'000/142', '000/148', '000/368', '000/438', '000/451'),
n = c(169, 13, 12, 12, 2, 2, 2, 2, 2, 2)
)
imp_birth_country_after <- data.frame(
country_code = c('000/151', '000/152', '000/160', '149/152', '000/124',
'000/142', '000/148', '000/368', '000/438', '000/451'),
n = c(11, 7, 5, 4, 2, 2, 2, 2, 2, 2)
)
# Combine data
combined <- full_join(imp_birth_country_before, imp_birth_country_after, by = "country_code", suffix = c("_before", "_after")) %>%
replace_na(list(n_before = 0, n_after = 0)) %>%
pivot_longer(cols = c(n_before, n_after), names_to = "group", values_to = "count")
# Plot horizontal bars
ggplot(combined, aes(x = country_code, y = count, fill = group)) +
geom_bar(stat = "identity", position = position_dodge()) +
scale_fill_manual(values = c("n_before" = "skyblue", "n_after" = "salmon"),
labels = c("Before", "After")) +
labs(x = "Country Code", y = "Count", fill = "Group",
title = "Before vs After Multiclass Imputation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, hjust = 1)) +
coord_flip()
library(shiny)
library(dplyr)
library(ggplot2)
library(forcats)
library(plotly)
library(RColorBrewer) # or viridis if installed
# Assume imputation_summary is already defined and n_missing calculated
n_missing <- sum(imputation_summary$n)
ui <- fluidPage(
titlePanel("Imputation Performance Explorer"),
sidebarLayout(
sidebarPanel(
sliderInput("threshold",
"Minimum Records to Display:",
min = 0,
max = 300,
value = 100,
step = 20)
),
mainPanel(
plotlyOutput("perf_plot")
)
)
)
server <- function(input, output, session) {
output$perf_plot <- renderPlotly({
# Filter dynamically based on slider
perf_data <- imputation_summary %>%
filter(imp_type != "given") %>%
filter(n > input$threshold) %>% # dynamic threshold
mutate(
pct = round(n / n_missing * 100, 1),
imp_type = fct_reorder(imp_type, n)
)
perf_plot <- ggplot(perf_data,
aes(x = imp_type, y = n, fill = imp_type,
text = paste0("Method: ", description,
"<br>Records: ", formatC(n, format="d", big.mark=","),
"<br>% of Missing: ", pct, "%"))) +
geom_col() +
coord_flip() +
scale_fill_brewer(palette = "Set2") + # built-in palette
labs(
title = "Records Imputed by Method",
x = "Imputation Method",
y = "Number of Records",
caption = "Hover for details"
) +
theme_minimal() +
theme(legend.position = "none")
ggplotly(perf_plot, tooltip = "text")
})
}
shinyApp(ui, server)
imputation_summary %>%
filter(imp_type != "given") %>%
mutate(
pct_missing = round(n / n_missing * 100, 1),
pct_total = round(n / n_total * 100, 1)
) %>%
select(Method = imp_type, Description = description,
Records = n, `% of Missing` = pct_missing, `% of Total` = pct_total) %>%
datatable(
options = list(
pageLength = 15,
dom = 'Bfrtip',
scrollX = TRUE
),
caption = "Detailed breakdown of imputation methods (click column headers to sort)"
) %>%
formatStyle(
'Records',
background = styleColorBar(range(imputation_summary$n), 'lightblue'),
backgroundSize = '100% 90%',
backgroundRepeat = 'no-repeat',
backgroundPosition = 'center'
)
# Interactive pie chart
pie_plot <- country_dist %>%
plot_ly(labels = ~country, values = ~n, type = 'pie',
textposition = 'inside',
textinfo = 'label+percent',
hovertemplate = paste('<b>%{label}</b><br>',
'Records: %{value:,}<br>',
'Percentage: %{percent}<br>',
'<extra></extra>'),
marker = list(colors = RColorBrewer::brewer.pal(8, "Set2"))) %>%
layout(title = "Distribution of Imputed Birth Countries",
showlegend = TRUE)
pie_plot
library(dplyr)
library(DT)
sample_results <- data_filled_df %>%
# Optional: take a sample of 1000 rows if your dataset is large
slice_sample(n = min(1000, nrow(data_filled_df))) %>%
# Join readable description for imp_type
left_join(imputation_summary %>% select(imp_type, description), by = "imp_type") %>%
# Select & rename columns for display
select(
ID = pid,
`Birth City` = birth_city,
`Imputed Country` = imp_name,
`Method Used` = description,
`Citizenship 1` = citizenship_1,
`Citizenship 2` = citizenship_2
)
# 3. Display interactive table
sample_results %>%
datatable(
filter = 'top',
options = list(
pageLength = 25,
scrollX = TRUE, # allow horizontal scroll
autoWidth = TRUE, # adjust column widths automatically
dom = 'Bfrtip'
),
caption = "Sample of imputation results - Use filters above columns to explore patterns"
)
#
# # Create sample of final results for exploration
# # Replace this with your actual data_filled_df
# sample_results <- data.frame(
# pid = 1:1000,
# birth_city = sample(c("BERLIN", "HAMBURG", "ISTANBUL", "WARSZAWA", "ROMA", "PARIS", "LONDON", NA), 1000, replace = TRUE),
# imp_birth_country = sample(c("000", "152", "163", "380", "826"), 1000, replace = TRUE),
# imp_name = sample(c("Germany", "Poland", "Turkey", "Italy", "United Kingdom"), 1000, replace = TRUE),
# imp_type = sample(imputation_summary$imp_type, 1000, replace = TRUE, prob = imputation_summary$n),
# citizenship_1 = sample(c("000", "152", "163", "380", "826", NA), 1000, replace = TRUE),
# citizenship_2 = sample(c("000", "152", "163", "380", "826", NA), 1000, replace = TRUE)
# )
#
# sample_results %>%
# select(ID = pid, `Birth City` = birth_city, `Imputed Country` = imp_name,
# `Method Used` = imp_type, `Citizenship 1` = citizenship_1, `Citizenship 2` = citizenship_2) %>%
# datatable(
# filter = 'top',
# options = list(
# pageLength = 25,
# scrollX = TRUE,
# dom = 'Bfrtip'
# ),
# caption = "Sample of imputation results - Use filters above columns to explore patterns"
# )
library(ggplot2)
library(plotly)
library(stringr)
validation_plot <- validation_data %>%
mutate(Category = str_wrap(Category, 25)) %>%
ggplot(aes(x = reorder(Category, Count), y = Count, fill = Category,
text = paste0("Category: ", Category,
"<br>Count: ", formatC(Count, format="d", big.mark=","),
"<br>Percentage: ", Percentage, "%"))) +
geom_col() +
coord_flip() +
scale_fill_brewer(type = "qual", palette = "Set2") +
labs(title = "Validation: Birth Country vs Citizenship Consistency",
x = "",
y = "Number of Records") +
theme_minimal() +
theme(legend.position = "none")
ggplotly(validation_plot, tooltip = "text")
Our imputation strategy employed a hierarchical approach:
This report was generated using R Markdown with interactive elements. All charts are interactive - hover for details and use table filters to explore the data.